0.1 Dimensionality Reduction

PCA: an orthogonal linear transformation which projects the genotype data into the new reduced dimensional space such that the greater variance comes in an order

UMAP: a novel non-linear dimensionality reduction technique based on Riemannian geometry and algebraic topology to model and preserve the high-dimensional topology of data points in the lowdimensional space

PCA–UMAP: an application of UMAP for principal components of genotype data to be computationally more advantageous and statistically less noisy

  • Diaz-Papkovich, A., et al. (2019). UMAP reveals cryptic population structure and phenotype heterogeneity in large genomic cohorts. PLOS Genetics 15(11), e1008432
  • Sakaue, S., et al. (2020). Dimensionality reduction reveals fine-scale structure in the Japanese population with consequences for polygenic risk prediction. Nature Communications 11(1), 1569
  • Karczewski, K., et al. (2020). The mutational constraint spectrum quantified from variation in 141,456 humans. Nature 581(7809), 434-443

0.2 mtReference Panel

  • Read in map/ped files
  • assign mitochondrial haplogroups using Hi-Mc
  • Convert alleles to 0/1 coding
## Haplogroup assignments 
dat <- generate_snp_data_fixed("/Users/sheaandrews/GitCode/MitoImpute/ReferencePanel/ReferencePanel.map",
                               "/Users/sheaandrews/GitCode/MitoImpute/ReferencePanel/ReferencePanel.ped")

data(nodes)
classifications.raw <- HiMC::getClassifications(dat)
classifications <- as_tibble(classifications.raw[-nrow(classifications.raw),]) #remove Mitochondria_Information row 

ped <- dat
colnames(ped) <- c("Family", "Individual", "Father", "Mother", 
                   "Sex", "Phenotype", paste0('mt', colnames(ped[, 7:ncol(ped)])))
ped <- as_tibble(ped) %>%
  na_if(., "0 0") %>% 
  mutate_at(vars(starts_with("mt")), as_factor) %>% 
  mutate_at(vars(starts_with("mt")), as.numeric) %>% 
  left_join(classifications) %>% 
  mutate(macro = ifelse(str_detect(haplogroup, "^L"),
                        substr(haplogroup, start = 1, stop = 2),
                        substr(haplogroup, start = 1, stop = 1))) %>% 
  select(Family, Individual, Father, Mother, Sex, Phenotype, haplogroup, macro, everything())

0.2.1 PCA analysis

Run PCA analysis of mtReference Panel

  • extract results for individules
  • plot 2D and 3D results
## PCA 
res.pca <- ped %>% 
  select(starts_with("mt")) %>% 
  PCA(., scale.unit = T, ncp = 10, graph = FALSE)

ind.pca <- get_pca_ind(res.pca)$coord %>% 
  as_tibble() %>%
  bind_cols(select(ped, Individual, haplogroup, macro)) %>% 
  select(Individual, haplogroup, macro, everything()) %>% 
  filter(macro %nin% c("u")) %>% 
  filter(!is.na(macro))

## Count number of mthg 
ind.pca %>% count(macro) %>% print(n = Inf)
## # A tibble: 19 x 2
##    macro     n
##    <chr> <int>
##  1 A      1176
##  2 B       143
##  3 C      1466
##  4 D      2073
##  5 H      7651
##  6 I       549
##  7 J      1733
##  8 K      1479
##  9 L0     1026
## 10 L1      837
## 11 L2     1252
## 12 L3     2251
## 13 M      5726
## 14 N      1230
## 15 R      2444
## 16 T      1607
## 17 U      3353
## 18 V       525
## 19 W       433
ggplot(ind.pca, aes(x = Dim.1, y = Dim.2, colour = macro)) + geom_point() + theme_bw()

plot_ly(ind.pca, x = ~Dim.2, y = ~Dim.1, z = ~Dim.3, color = ~macro, type = 'scatter3d', mode = 'markers', marker = list(size = 3), 
        hoverinfo = 'text', text = ~paste('</br> ID: ', Individual, 
                                          '</br> Haplogroup: ', macro,
                                          '</br> PC1: ', round(Dim.1, 2),
                                          '</br> PC2: ', round(Dim.2, 2),
                                          '</br> PC3: ', round(Dim.3, 2)))

0.2.1.1 Removing african hg (L0, L1, L2, L3)

## PCA 
oa.pca <- ped %>% 
  filter(macro %nin% c("L0", "L1", "L2", "L3")) %>% 
  select(starts_with("mt")) %>% 
  PCA(., scale.unit = T, ncp = 10, graph = FALSE)

oa.ind.pca <- get_pca_ind(oa.pca)$coord %>% 
  as_tibble() %>%
  bind_cols(select(filter(ped, macro %nin% c("L0", "L1", "L2", "L3")), Individual, haplogroup, macro)) %>% 
  select(Individual, haplogroup, macro, everything()) %>% 
  filter(macro %nin% c("u")) %>% 
  filter(!is.na(macro))
ggplot(oa.ind.pca, aes(x = Dim.1, y = Dim.2, colour = macro)) + geom_point() + theme_bw() + 
    scale_color_manual(values = cols25())

plot_ly(oa.ind.pca, x = ~Dim.2, y = ~Dim.1, z = ~Dim.3, color = ~macro, 
        type = 'scatter3d', mode = 'markers', marker = list(size = 3), 
        hoverinfo = 'text', text = ~paste('</br> ID: ', Individual, 
                                          '</br> Haplogroup: ', macro,
                                          '</br> PC1: ', round(Dim.1, 2),
                                          '</br> PC2: ', round(Dim.2, 2),
                                          '</br> PC3: ', round(Dim.3, 2)))

0.3 Thousand Genomes

0.3.1 Data Wrangling

## Haplogroup assignments 
dat_1kg <- generate_snp_data_fixed(
  "/Users/sheaandrews/GitCode/MitoImputePrep/DerivedData/ThousandGenomes/chrMT_1kg_norm_decomposed_firstAlt.map",
  "/Users/sheaandrews/GitCode/MitoImputePrep/DerivedData/ThousandGenomes/chrMT_1kg_norm_decomposed_firstAlt.ped")
info_1kg <- read_tsv("/Users/sheaandrews/GitCode/MitoImputePrep/data/ThousandGenomes/20130606_g1k.ped", guess_max = 10000) %>% 
  select(Individual = `Individual ID`, Family_ID = `Family ID`, Father = `Paternal ID`, Mother = `Maternal ID`, Population, Gender)

data(nodes)
classifications_1kg.raw <- HiMC::getClassifications(dat_1kg)
classifications_1kg <- as_tibble(classifications_1kg.raw[-nrow(classifications_1kg.raw),]) #remove Mitochondria_Information row 

ped_1kg <- dat_1kg
colnames(ped_1kg) <- c("Family", "Individual", "Father", "Mother", 
                   "Sex", "Phenotype", paste0('mt', colnames(ped_1kg[, 7:ncol(ped_1kg)])))
ped_1kg <- as_tibble(ped_1kg) %>%
  na_if(., "0 0") %>% 
  mutate_at(vars(starts_with("mt")), as_factor) %>% 
  mutate_at(vars(starts_with("mt")), as.numeric) %>% 
  left_join(classifications_1kg) %>% 
  mutate(macro = ifelse(str_detect(haplogroup, "^L"),
                        substr(haplogroup, start = 1, stop = 2),
                        substr(haplogroup, start = 1, stop = 1))) %>% 
  select(-Family, -Sex, -Father, -Mother) %>%
  left_join(info_1kg, by = "Individual") %>%
  select(Family = Family_ID, Individual, Father, Mother, Sex = Gender, Phenotype, 
         pop = Population, haplogroup, macro, full_path, everything()) %>% 
  filter(macro %nin% c("u")) %>% 
  filter(!is.na(macro)) # %>% 
  # group_by(Family) %>% 
  # slice(1) %>% 
  # ungroup()
#  filter(macro %nin% c("L0", "L1", "L2", "L3"))

miss_gt <- ped_1kg %>%
  select(everything()) %>%  # replace to your needs
  summarise_all( ~ sum(is.na(.))) %>% 
  t() %>% 
  as_tibble(rownames = "var") %>% 
  filter(V1 > 0) %>% pull(var)
count(ped_1kg, pop) %>% print(n = Inf)
count(ped_1kg, macro) %>% print(n = Inf)
count(ped_1kg, haplogroup) %>% print(n = Inf)

0.3.2 PCA

## PCA 
res.pca_1kg <- ped_1kg %>% 
  select(starts_with("mt")) %>% 
  select(-all_of(miss_gt)) %>%
#  mutate_all(., .funs = function(x){ifelse(is.na(x), round(mean(x, na.rm = TRUE)), x)}) %>%
  PCA(., scale.unit = F, ncp = 10, graph = FALSE)

ind.pca_1kg <- get_pca_ind(res.pca_1kg)$coord %>% 
  as_tibble() %>%
  bind_cols(select(ped_1kg, Individual, haplogroup, macro, pop)) %>% 
  select(Individual, haplogroup, macro, everything()) 
ggplot(ind.pca_1kg, aes(x = Dim.1, y = Dim.2, colour = macro)) + 
  geom_point() + 
  scale_color_manual(values = cols25()) + 
  theme_bw() + theme(legend.position = "bottom")

eig.val_1kg <- get_eigenvalue(res.pca_1kg)
fviz_eig(res.pca_1kg, addlabels = TRUE, ylim = c(0, 50))

plot_ly(ind.pca_1kg, x = ~Dim.2, y = ~Dim.1, z = ~Dim.3, color = ~macro, colors = cols25(), 
        type = 'scatter3d', mode = 'markers', marker = list(size = 3), 
        hoverinfo = 'text', text = ~paste('</br> ID: ', Individual, 
                                          '</br> Haplogroup: ', macro,
                                          '</br> PC1: ', round(Dim.1, 2),
                                          '</br> PC2: ', round(Dim.2, 2),
                                          '</br> PC3: ', round(Dim.3, 2)))

0.3.3 UMAP

## UMAP 
embedding_mt_2d <- ped_1kg %>% 
  select(starts_with("mt")) %>% 
  select(-all_of(miss_gt)) %>%
#  mutate_all(., .funs = function(x){ifelse(is.na(x), round(mean(x, na.rm = TRUE)), x)}) %>%
  umapr::umap(., min_dist = 0.5, n_neighbor = 30, n_components = 2) %>% 
  as.tibble() %>%
  bind_cols(select(ped_1kg, Individual, haplogroup, macro)) 
## [1] "umap-learn already installed"
embedding_mt_3d <- ped_1kg %>% 
  select(starts_with("mt")) %>% 
  select(-all_of(miss_gt)) %>%
#  mutate_all(., .funs = function(x){ifelse(is.na(x), round(mean(x, na.rm = TRUE)), x)}) %>%
  umapr::umap(., min_dist = 0.5, n_neighbor = 30, n_components = 3) %>% 
  as.tibble() %>%
  bind_cols(select(ped_1kg, Individual, haplogroup, macro)) 
## [1] "umap-learn already installed"
## PCA+UMAP
embedding_pca_2d <- select(ind.pca_1kg, starts_with("Dim")) %>% 
  umapr::umap(., min_dist = 0.5, n_neighbor = 30, n_components = 2) %>% 
  as.tibble() %>%
  bind_cols(select(ped_1kg, Individual, haplogroup, macro)) 
## [1] "umap-learn already installed"
embedding_pca_3d <- select(ind.pca_1kg, starts_with("Dim")) %>%
  umapr::umap(., min_dist = 0.5, n_neighbor = 30, n_components = 3) %>% 
  as.tibble() %>%
  bind_cols(select(ped_1kg, Individual, haplogroup, macro)) 
## [1] "umap-learn already installed"
ggpubr::ggarrange(
  ggplot(embedding_mt_2d, aes(x = UMAP1, y = UMAP2, color = macro)) + 
    geom_point() + scale_color_manual(values = cols25()) +  theme_bw() + labs(title = "UMAP"),
  ggplot(embedding_pca_2d, aes(x = UMAP1, y = UMAP2, color = macro)) + 
    geom_point() + scale_color_manual(values = cols25()) +  theme_bw() + labs(title = "PCA+UMAP"), 
  common.legend = TRUE, legend = "bottom"
)

plot_ly(embedding_mt_3d, x = ~UMAP2, y = ~UMAP1, z = ~UMAP3, color = ~macro, colors = cols25(),
        type = 'scatter3d', mode = 'markers', marker = list(size = 3),
        hoverinfo = 'text', text = ~paste('</br> ID: ', Individual, 
                                          '</br> Macro: ', macro,
                                          '</br> Haplogroup: ', haplogroup,
                                          '</br> PC1: ', round(UMAP1, 2),
                                          '</br> PC2: ', round(UMAP2, 2),
                                          '</br> PC3: ', round(UMAP3, 2))) %>%  
        layout(title="UMAP")
plot_ly(embedding_pca_3d, x = ~UMAP2, y = ~UMAP1, z = ~UMAP3, color = ~macro, colors = cols25(), 
        type = 'scatter3d', mode = 'markers', marker = list(size = 3), 
        hoverinfo = 'text', text = ~paste('</br> ID: ', Individual, 
                                          '</br> Macro: ', macro,
                                          '</br> Haplogroup: ', haplogroup,
                                          '</br> PC1: ', round(UMAP1, 2),
                                          '</br> PC2: ', round(UMAP2, 2),
                                          '</br> PC3: ', round(UMAP3, 2))) %>%  
        layout(title="PCA+UMAP")